{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 更大的context\n",
    "- 使用额外的context（语境/上下文）训练我们的语言模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "\n",
    "import numpy as np\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file = './data/bobsue.prevsent.train.tsv'\n",
    "dev_file = './data/bobsue.prevsent.dev.tsv'\n",
    "test_file = './data/bobsue.prevsent.test.tsv'\n",
    "word_file = './data/bobsue.voc.txt'\n",
    "\n",
    "BATCH_SIZE = 32       # 批次大小\n",
    "EMBEDDING_DIM = 200   # 词向量维度\n",
    "HIDDEN_DIM = 200      # 隐含层\n",
    "GRAD_CLIP = 5.        # 梯度截断值\n",
    "EPOCHS = 20\n",
    "LEARNING_RATE = 0.01     # 初始学习率\n",
    "\n",
    "BEST_VALID_ACC = 0.           # 初始验证集上的损失值，设为0\n",
    "MODEL_PATH = \"lm-large-cont-dim{}.pth\"   # 模型名称\n",
    "USE_CUDA = torch.cuda.is_available()     # 是否使用GPU\n",
    "NUM_CUDA = torch.cuda.device_count()     # GPU数量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_word_set(path):\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        text = f.readlines()\n",
    "    words = [w.strip() for w in text]\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "words_set = read_word_set(word_file)\n",
    "word2idx = {w:i for i, w in enumerate(words_set, 1)}\n",
    "idx2word = {i:w for i, w in enumerate(words_set, 1)}\n",
    "# 设置 <pad> 值为 0\n",
    "PAD_IDX = 0\n",
    "idx2word[PAD_IDX] = '<pad>'\n",
    "word2idx['<pad>'] = PAD_IDX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_corpus(path):\n",
    "    \"\"\"读取数据集，返回句子列表\"\"\"\n",
    "    contexts = []\n",
    "    target_sentences = []\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        for sentence in f.readlines():\n",
    "            sentence = sentence.strip()\n",
    "            context, target_sentence = sentence.split('\\t')\n",
    "            contexts.append(context)\n",
    "            target_sentences.append(target_sentence)\n",
    "    \n",
    "    return (contexts, target_sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_context, train_target = read_corpus(train_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(6036, 6036)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_context), len(train_target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('<s> Sue realized she was really bored . </s>',\n",
       " '<s> She ate quickly and asked to be taken home . </s>')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_context[0], train_target[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_context, train_target = read_corpus(train_file)\n",
    "dev_context, dev_target = read_corpus(dev_file)\n",
    "test_context, test_target = read_corpus(test_file)\n",
    "\n",
    "train_words = [w for s in train_context+train_target for w in s.split()]\n",
    "dev_words = [w for s in dev_context+dev_target for w in s.split()]\n",
    "test_words = [w for s in test_context+test_target for w in s.split()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集集句子个数：750\n",
      "验证集句子个数：750\n",
      "测试集句子个数：750\n",
      "训练集集单词个数：139045\n",
      "验证集单词个数：16984\n",
      "测试集单词个数：17233\n"
     ]
    }
   ],
   "source": [
    "print(\"训练集集句子个数：{}\".format(len(test_context)))\n",
    "print(\"验证集句子个数：{}\".format(len(dev_context)))\n",
    "print(\"测试集句子个数：{}\".format(len(test_context)))\n",
    "\n",
    "print(\"训练集集单词个数：{}\".format(len(train_words)))\n",
    "print(\"验证集单词个数：{}\".format(len(dev_words)))\n",
    "print(\"测试集单词个数：{}\".format(len(test_words)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "训练集第二句最长句子长度为：21\n",
      "验证集第二句最长句子长度为：20\n",
      "测试集第二句最长句子长度为：21\n"
     ]
    }
   ],
   "source": [
    "print(\"训练集第二句最长句子长度为：{}\".format(max([len(s.split()) for s in train_target])))\n",
    "print(\"验证集第二句最长句子长度为：{}\".format(max([len(s.split()) for s in dev_target])))\n",
    "print(\"测试集第二句最长句子长度为：{}\".format(max([len(s.split()) for s in test_target])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_sequence(context, target, word2idx, seq_len=21):\n",
    "    \"\"\"输入语料句子列表，返回模型输入序列的idx\"\"\"\n",
    "    contexts = []\n",
    "    sentences = []\n",
    "    labels = []\n",
    "    \n",
    "    for c, t in zip(context,target):\n",
    "        c_words = c.split()\n",
    "        c_tample = [0] * seq_len\n",
    "        for i, w in enumerate(c_words):\n",
    "            c_tample[i] = word2idx[w]\n",
    "        contexts.append(c_tample)\n",
    "        \n",
    "            \n",
    "        t_words = t.split()\n",
    "        sentence_tample = [0] * seq_len\n",
    "        for i, w in enumerate(t_words[:-1]):\n",
    "            sentence_tample[i] = word2idx[w]\n",
    "        sentences.append(sentence_tample)\n",
    "        \n",
    "        target_tample = [0] * seq_len\n",
    "        for i, w in enumerate(t_words[1:]):\n",
    "            target_tample[i] = word2idx[w]\n",
    "        labels.append(target_tample)\n",
    "        \n",
    "    return contexts, sentences, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_context, train_data, train_label = prepare_sequence(train_context, train_target, word2idx)\n",
    "dev_context, dev_data, dev_label = prepare_sequence(dev_context, dev_target, word2idx)\n",
    "test_context, test_data, test_label = prepare_sequence(test_context, test_target, word2idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<s> Sue realized she was really bored . </s> \n",
      "<s> She ate quickly and asked to be taken home . \n",
      "She ate quickly and asked to be taken home . </s> \n"
     ]
    }
   ],
   "source": [
    "idx = 0\n",
    "for i in train_context[idx]:\n",
    "    if i==0:\n",
    "        print()\n",
    "        break\n",
    "    print(idx2word[i], end=' ')\n",
    "    \n",
    "for i in train_data[idx]:\n",
    "    if i==0:\n",
    "        print()\n",
    "        break\n",
    "    print(idx2word[i], end=' ')\n",
    "    \n",
    "for i in train_label[idx]:\n",
    "    if i==0:\n",
    "        print()\n",
    "        break\n",
    "    print(idx2word[i], end=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_batch(context, data, label, batch_size=32):\n",
    "    \"\"\"\n",
    "    构建 batch tensor，返回 batch 列表，每个batch为二元组包含data和label\n",
    "   \n",
    "    \"\"\"\n",
    "    batch_data = []\n",
    "    context_tensor = torch.tensor(context, dtype=torch.long)\n",
    "    data_tensor = torch.tensor(data, dtype=torch.long)\n",
    "    label_tensor = torch.tensor(label, dtype=torch.long)\n",
    "    n, dim = data_tensor.size()\n",
    "    for start in range(0, n, batch_size):\n",
    "        end = start + batch_size\n",
    "        if end > n:\n",
    "            print(\"data not eq batch size.\")\n",
    "            break\n",
    "            cbatch = context_tensor[start: ]\n",
    "            dbatch = data_tensor[start: ]\n",
    "            lbatch = label_tensor[start: ]\n",
    "            print(batch.size())\n",
    "        else:\n",
    "            cbatch = context_tensor[start: end]\n",
    "            dbatch = data_tensor[start: end]\n",
    "            lbatch = label_tensor[start: end]\n",
    "        batch_data.append((cbatch, dbatch, lbatch))\n",
    "    return batch_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data not eq batch size.\n",
      "data not eq batch size.\n",
      "data not eq batch size.\n"
     ]
    }
   ],
   "source": [
    "train_batch = get_batch(train_context, train_data, train_label, batch_size=BATCH_SIZE)\n",
    "dev_batch = get_batch(dev_context, dev_data, dev_label, batch_size=BATCH_SIZE)\n",
    "test_batch = get_batch(test_context, test_data, test_label, batch_size=BATCH_SIZE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LSTMLM(nn.Module):\n",
    "    def __init__(self, embedding_dim, hidden_dim, vocab_size):\n",
    "        super(LSTMLM, self).__init__()\n",
    "        self.hidden_dim = hidden_dim\n",
    "        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
    "        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)\n",
    "        self.hidden2word = nn.Linear(hidden_dim, vocab_size)\n",
    "        \n",
    "    def forward(self, context, data):\n",
    "        \n",
    "        # [batch_size, seq_len] ==> [batch_size, seq_len, embedding_dim]\n",
    "        context_embed = self.word_embeddings(context)\n",
    "        embeds = self.word_embeddings(data)\n",
    "        # [batch, seq_len, imput_size] ==> [batch, seq_len, hidden_size]\n",
    "        lstm_out, hidden = self.lstm(context_embed)\n",
    "        lstm_out, (h_n, c_n) = self.lstm(embeds, hidden)\n",
    "        # [batch, seq_len, hidden_size] ==> [batch*seq_len, vocab_size]\n",
    "        target_space = self.hidden2word(lstm_out.contiguous().view(-1, self.hidden_dim))\n",
    "        # 添加mask\n",
    "        mask = (data != PAD_IDX).view(-1)\n",
    "        # 获取 非pad 数据\n",
    "        mask_target = target_space[mask]\n",
    "        \n",
    "        target_scores = F.log_softmax(mask_target, dim=1)\n",
    "        return target_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def acc_score(y_hat, y):\n",
    "    # 返回最大的概率的索引\n",
    "    pred = y_hat.argmax(dim=1)\n",
    "    # print(y.view(-1))\n",
    "    acc_count = torch.eq(pred, y.view(-1))\n",
    "    score = acc_count.sum().item() / acc_count.size()[0]\n",
    "    return score\n",
    "\n",
    "def train(model, device, iterator, optimizer, criterion, grad_clip):\n",
    "    epoch_loss = 0  # 积累变量\n",
    "    epoch_acc = 0   # 积累变量\n",
    "    model.train()   # 该函数表示PHASE=Train\n",
    "    \n",
    "    for c, x, y in iterator:  # 拿每一个minibatch\n",
    "        c = c.to(device)\n",
    "        x = x.to(device)\n",
    "        y = y.to(device)\n",
    "        \n",
    "        optimizer.zero_grad()\n",
    "        mask = y != PAD_IDX\n",
    "        pure_y = y[mask]\n",
    "        \n",
    "        fx = model(c, x)                 # 进行forward\n",
    "        loss = criterion(fx, pure_y)  # 计算loss\n",
    "        acc = acc_score(fx, pure_y)   # 计算准确率\n",
    "        loss.backward()               # 进行BP\n",
    "        \n",
    "        # 梯度裁剪\n",
    "        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)\n",
    "        optimizer.step()  # 更新参数\n",
    "        \n",
    "        epoch_loss += loss\n",
    "        epoch_acc += acc\n",
    "        \n",
    "    return epoch_loss/len(iterator),epoch_acc/len(iterator)\n",
    "\n",
    "def evaluate(model, device, iterator, criterion):\n",
    "    model.eval()  # 不更新参数，预测模式\n",
    "    epoch_loss=0  # 积累变量\n",
    "    epoch_acc=0   # 积累变量\n",
    "    \n",
    "    with torch.no_grad():\n",
    "        for c, x, y in iterator:\n",
    "            c = c.to(device)\n",
    "            x = x.to(device)\n",
    "            y = y.to(device)\n",
    "            mask = y != PAD_IDX\n",
    "            pure_y = y[mask]\n",
    "            \n",
    "            fx = model(c, x)\n",
    "            loss = criterion(fx, pure_y)\n",
    "            acc = acc_score(fx, pure_y)\n",
    "            epoch_loss += loss\n",
    "            epoch_acc += acc\n",
    "    return epoch_loss/len(iterator), epoch_acc/len(iterator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 1, 2, 3]\n"
     ]
    }
   ],
   "source": [
    "model = LSTMLM(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx))\n",
    "# 使用GPU\n",
    "DEVICE = torch.device(\"cuda\" if USE_CUDA else 'cpu')\n",
    "model = model.to(DEVICE)\n",
    "if NUM_CUDA > 1:\n",
    "    device_ids = list(range(NUM_CUDA))\n",
    "    print(device_ids)\n",
    "    model = nn.DataParallel(model, device_ids=device_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/anaconda3/lib/python3.6/site-packages/torch/nn/modules/rnn.py:179: RuntimeWarning: RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters().\n",
      "  self.dropout, self.training, self.bidirectional, self.batch_first)\n",
      "/root/anaconda3/lib/python3.6/site-packages/torch/serialization.py:251: UserWarning: Couldn't retrieve source code for container of type LSTMLM. It won't be checked for correctness upon loading.\n",
      "  \"type \" + obj.__name__ + \". It won't be checked \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:1|Train Loss:3.98225|Train Acc:0.279988|Val Loss:3.53932|Val Acc:0.318935\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:2|Train Loss:3.29483|Train Acc:0.325456|Val Loss:3.50593|Val Acc:0.322937\n",
      "Current lr: 0.01\n",
      "Epoch:3|Train Loss:2.9768|Train Acc:0.346684|Val Loss:3.55512|Val Acc:0.320301\n",
      "Current lr: 0.005\n",
      "Epoch:4|Train Loss:2.73913|Train Acc:0.369789|Val Loss:3.65205|Val Acc:0.318491\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:5|Train Loss:2.33257|Train Acc:0.438309|Val Loss:3.66164|Val Acc:0.324247\n",
      "Current lr: 0.0025\n",
      "Epoch:6|Train Loss:2.07493|Train Acc:0.490698|Val Loss:3.74153|Val Acc:0.319476\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:7|Train Loss:1.81699|Train Acc:0.552415|Val Loss:3.76917|Val Acc:0.326346\n",
      "Current lr: 0.00125\n",
      "Epoch:8|Train Loss:1.69411|Train Acc:0.582961|Val Loss:3.82684|Val Acc:0.325274\n",
      "Current lr: 0.000625\n",
      "Epoch:9|Train Loss:1.56808|Train Acc:0.616432|Val Loss:3.85215|Val Acc:0.3233\n",
      "Current lr: 0.0003125\n",
      "Epoch:10|Train Loss:1.49412|Train Acc:0.637624|Val Loss:3.87085|Val Acc:0.323787\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:11|Train Loss:1.45339|Train Acc:0.649028|Val Loss:3.88107|Val Acc:0.326846\n",
      "Current lr: 0.00015625\n",
      "Epoch:12|Train Loss:1.43826|Train Acc:0.652809|Val Loss:3.88965|Val Acc:0.32667\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:13|Train Loss:1.41802|Train Acc:0.659076|Val Loss:3.89433|Val Acc:0.329787\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:14|Train Loss:1.41019|Train Acc:0.664199|Val Loss:3.89817|Val Acc:0.330222\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:15|Train Loss:1.4018|Train Acc:0.668941|Val Loss:3.90172|Val Acc:0.331231\n",
      "Save model:  lm-large-cont-dim200.pth\n",
      "Epoch:16|Train Loss:1.39323|Train Acc:0.67114|Val Loss:3.90542|Val Acc:0.331238\n",
      "Current lr: 7.8125e-05\n",
      "Epoch:17|Train Loss:1.3845|Train Acc:0.674491|Val Loss:3.90938|Val Acc:0.330333\n",
      "Current lr: 3.90625e-05\n",
      "Epoch:18|Train Loss:1.37252|Train Acc:0.678065|Val Loss:3.91194|Val Acc:0.331097\n",
      "Current lr: 1.953125e-05\n",
      "Epoch:19|Train Loss:1.36638|Train Acc:0.680176|Val Loss:3.91317|Val Acc:0.33096\n",
      "Current lr: 9.765625e-06\n",
      "Epoch:20|Train Loss:1.36329|Train Acc:0.681004|Val Loss:3.91378|Val Acc:0.330965\n"
     ]
    }
   ],
   "source": [
    "criterion = nn.NLLLoss()            # 指定损失函数\n",
    "optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)  # 指定优化器\n",
    "scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)   # 学习率缩减？\n",
    "\n",
    "model_name = MODEL_PATH.format(EMBEDDING_DIM)\n",
    "LOG_INFO = 'Epoch:{}|Train Loss:{:.6}|Train Acc:{:.6}|Val Loss:{:.6}|Val Acc:{:.6}'\n",
    "\n",
    "SCHED_NUM = 0\n",
    "for epoch in range(1, EPOCHS+1):\n",
    "    train_loss, train_acc = train(model, DEVICE, train_batch, optimizer, criterion, GRAD_CLIP)\n",
    "    valid_loss, valid_acc = evaluate(model, DEVICE, dev_batch, criterion)\n",
    "    # 如果是测试集准确率有提升\n",
    "    if valid_acc > BEST_VALID_ACC: \n",
    "        BEST_VALID_ACC = valid_acc\n",
    "        torch.save(model, model_name)\n",
    "        print(\"Save model: \", model_name)\n",
    "        SCHED_NUM = 0\n",
    "    else:\n",
    "        SCHED_NUM += 1\n",
    "        scheduler.step()\n",
    "        print(\"Current lr:\", optimizer.param_groups[0]['lr'])\n",
    "        if SCHED_NUM == 7:\n",
    "            print(LOG_INFO.format(epoch, train_loss, train_acc, valid_loss, valid_acc))\n",
    "            print(\"Early stop!\")\n",
    "            break\n",
    "    print(LOG_INFO.format(epoch, train_loss, train_acc, valid_loss, valid_acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Loss: 3.91726 | Test Acc: 0.333578 |\n"
     ]
    }
   ],
   "source": [
    "model = torch.load(model_name)\n",
    "model = model.to(DEVICE)\n",
    "test_loss, test_acc = evaluate(model, DEVICE, test_batch, criterion)\n",
    "print('Test Loss: {:.6} | Test Acc: {:.6} |'.format(test_loss, test_acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_pred_error_words(model,device,data_batch):\n",
    "    model.eval()\n",
    "    error_words = []\n",
    "    with torch.no_grad():\n",
    "        for c, x, y in data_batch:\n",
    "            c = c.to(device)\n",
    "            x = x.to(device)\n",
    "            y = y.to(device)\n",
    "            \n",
    "            mask = (y!=PAD_IDX)\n",
    "            fx = model(c, x)\n",
    "            \n",
    "            pred_idx = fx.argmax(dim=1)\n",
    "            ground_truth_idx = y[mask]\n",
    "            for p, g in zip(pred_idx.tolist(), ground_truth_idx.tolist()):\n",
    "                if p != g:\n",
    "                    error_words.append(\" | \".join([idx2word[g], idx2word[p]]))\n",
    "    return error_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = torch.load(model_name)\n",
    "error_words = print_pred_error_words(model, DEVICE, test_batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "真实值 | 预测值 | 预测错误次数\n",
      "('Bob | He', 88)\n",
      "('Sue | She', 54)\n",
      "('to | .', 44)\n",
      "('had | was', 38)\n",
      "('decided | was', 37)\n",
      "('Bob | She', 33)\n",
      "('and | .', 30)\n",
      "('. | to', 26)\n",
      "('her | the', 24)\n",
      "('in | .', 24)\n",
      "('Sue | He', 24)\n",
      "('for | .', 24)\n",
      "('his | the', 22)\n",
      "('She | He', 21)\n",
      "('He | She', 20)\n",
      "(', | .', 20)\n",
      "('the | his', 20)\n",
      "('. | and', 20)\n",
      "('the | .', 19)\n",
      "('His | He', 17)\n",
      "('a | the', 17)\n",
      "('went | was', 17)\n",
      "('the | her', 16)\n",
      "('Her | She', 16)\n",
      "('and | to', 16)\n",
      "(\"'s | was\", 15)\n",
      "('got | was', 15)\n",
      "('Sue | Bob', 15)\n",
      "('she | he', 14)\n",
      "('a | to', 14)\n",
      "('the | a', 14)\n",
      "('They | She', 14)\n",
      "('One | She', 14)\n",
      "('wanted | was', 14)\n",
      "('he | to', 14)\n"
     ]
    }
   ],
   "source": [
    "words_counter = Counter(error_words)\n",
    "TopN = 35\n",
    "topn_words = words_counter.most_common(TopN)\n",
    "print(\"真实值 | 预测值 | 预测错误次数\")\n",
    "for w in topn_words:\n",
    "    print(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
